import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import plotly.express as px
import plotly.graph_objects as go
from pandas.plotting import scatter_matrix
import statsmodels.api as sm
import warnings
warnings.filterwarnings("ignore")
file_path = "C:/PERSONAL/Personal/PROJ/NTI/FFFIIINNNAALLL/WideWorld.xlsx"
Graduation_Data = pd.ExcelFile(file_path)
# Check available sheets in the Excel file
sheet_names = Graduation_Data.sheet_names
# Load data from the main sheet
grad_data = Graduation_Data.parse(sheet_names[0])
print(grad_data.head())
CityID CityName StateProvinceID StateProvinceName CountryID \
0 9077 Dickworsham 45 Texas 230
1 26752 Petronila 45 Texas 230
2 9857 East Mountain 45 Texas 230
3 8375 Dallardsville 45 Texas 230
4 29320 Rockwall 45 Texas 230
CountryName customerID CustomerName \
0 United States 599 Wingtip Toys (Dickworsham, TX)
1 United States 814 Johanna Hoornstra
2 United States 530 Wingtip Toys (East Mountain, TX)
3 United States 34 Tailspin Toys (Dallardsville, TX)
4 United States 408 Wingtip Toys (Rockwall, TX)
CustomerCategoryID CustomerCategoryName ... InvoiceID InvoiceLineID \
0 3 Novelty Shop ... 985 3025
1 3 Novelty Shop ... 1293 4037
2 3 Novelty Shop ... 1518 4811
3 3 Novelty Shop ... 1719 5512
4 3 Novelty Shop ... 1813 5831
Quantity UnitPrice TaxAmount TaxRate LineProfit ExtendedPrice \
0 9 13.0 17.55 15 76.5 134.55
1 9 13.0 17.55 15 76.5 134.55
2 9 13.0 17.55 15 76.5 134.55
3 9 13.0 17.55 15 76.5 134.55
4 9 13.0 17.55 15 76.5 134.55
StockItemID StockItemName
0 39 Developer joke mug - inheritance is the OO way...
1 40 Developer joke mug - (hip, hip, array) (White)
2 47 Developer joke mug - a foo walks into a bar (B...
3 36 Developer joke mug - when your hammer is C++ (...
4 40 Developer joke mug - (hip, hip, array) (White)
[5 rows x 23 columns]
print(grad_data.columns)
Index(['CityID', 'CityName', 'StateProvinceID', 'StateProvinceName',
'CountryID', 'CountryName', 'customerID', 'CustomerName',
'CustomerCategoryID', 'CustomerCategoryName', 'DeliveryMethodID',
'DeliveryMethodName', 'InvoiceDate', 'InvoiceID', 'InvoiceLineID',
'Quantity', 'UnitPrice', 'TaxAmount', 'TaxRate', 'LineProfit',
'ExtendedPrice', 'StockItemID', 'StockItemName'],
dtype='object')
print(grad_data.describe())
CityID StateProvinceID CountryID customerID \
count 228265.000000 228265.000000 228265.0 228265.000000
mean 18968.037803 27.464425 230.0 528.712383
std 10991.856489 14.997132 0.0 343.722468
min 15.000000 1.000000 230.0 1.000000
25% 9331.000000 15.000000 230.0 160.000000
50% 18882.000000 31.000000 230.0 518.000000
75% 28434.000000 39.000000 230.0 877.000000
max 38184.000000 53.000000 230.0 1061.000000
CustomerCategoryID DeliveryMethodID InvoiceID InvoiceLineID \
count 228265.000000 228265.0 228265.000000 228265.000000
mean 3.692038 3.0 35179.209686 114133.000000
std 1.250669 0.0 20337.208453 65894.573936
min 3.000000 3.0 1.000000 1.000000
25% 3.000000 3.0 17572.000000 57067.000000
50% 3.000000 3.0 35152.000000 114133.000000
75% 4.000000 3.0 52765.000000 171199.000000
max 7.000000 3.0 70510.000000 228265.000000
Quantity UnitPrice TaxAmount TaxRate \
count 228265.000000 228265.000000 228265.000000 228265.000000
mean 39.211566 45.591689 112.948101 14.977307
std 54.558829 139.862055 217.512539 0.336081
min 1.000000 0.660000 0.380000 10.000000
25% 5.000000 13.000000 14.400000 15.000000
50% 10.000000 18.000000 34.500000 15.000000
75% 60.000000 32.000000 129.600000 15.000000
max 360.000000 1899.000000 2848.500000 15.000000
LineProfit ExtendedPrice StockItemID
count 228265.000000 228265.000000 228265.000000
mean 375.568663 867.603178 110.181285
std 754.052045 1668.036182 63.729035
min -645.000000 2.880000 1.000000
25% 51.000000 110.400000 54.000000
50% 120.000000 264.500000 111.000000
75% 390.000000 993.600000 165.000000
max 9200.000000 21838.500000 227.000000
print(grad_data.info())
<class 'pandas.core.frame.DataFrame'> RangeIndex: 228265 entries, 0 to 228264 Data columns (total 23 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 CityID 228265 non-null int64 1 CityName 228265 non-null object 2 StateProvinceID 228265 non-null int64 3 StateProvinceName 228265 non-null object 4 CountryID 228265 non-null int64 5 CountryName 228265 non-null object 6 customerID 228265 non-null int64 7 CustomerName 228265 non-null object 8 CustomerCategoryID 228265 non-null int64 9 CustomerCategoryName 228265 non-null object 10 DeliveryMethodID 228265 non-null int64 11 DeliveryMethodName 228265 non-null object 12 InvoiceDate 228265 non-null datetime64[ns] 13 InvoiceID 228265 non-null int64 14 InvoiceLineID 228265 non-null int64 15 Quantity 228265 non-null int64 16 UnitPrice 228265 non-null float64 17 TaxAmount 228265 non-null float64 18 TaxRate 228265 non-null int64 19 LineProfit 228265 non-null float64 20 ExtendedPrice 228265 non-null float64 21 StockItemID 228265 non-null int64 22 StockItemName 228265 non-null object dtypes: datetime64[ns](1), float64(4), int64(11), object(7) memory usage: 40.1+ MB None
print(grad_data.isnull().sum())
CityID 0 CityName 0 StateProvinceID 0 StateProvinceName 0 CountryID 0 CountryName 0 customerID 0 CustomerName 0 CustomerCategoryID 0 CustomerCategoryName 0 DeliveryMethodID 0 DeliveryMethodName 0 InvoiceDate 0 InvoiceID 0 InvoiceLineID 0 Quantity 0 UnitPrice 0 TaxAmount 0 TaxRate 0 LineProfit 0 ExtendedPrice 0 StockItemID 0 StockItemName 0 dtype: int64
data_cleaned = grad_data.dropna()
data_cleaned = data_cleaned.drop_duplicates()
data_cleaned['InvoiceDate'] = pd.to_datetime(data_cleaned['InvoiceDate'])
# 1. Total Price: Total cost per line item
data_cleaned['TotalPrice'] = data_cleaned['Quantity'] * data_cleaned['UnitPrice']
# 2. Profit Margin: Profit percentage based on LineProfit and Total Price
data_cleaned['ProfitMargin'] = (data_cleaned['LineProfit'] / data_cleaned['TotalPrice']) * 100
# 3. Tax per Item: Tax per item based on TaxAmount and Quantity
data_cleaned['TaxPerItem'] = data_cleaned['TaxAmount'] / data_cleaned['Quantity']
# 4. Revenue per Customer: Total price per customer for each transaction
data_cleaned['RevenuePerCustomer'] = data_cleaned.groupby('customerID')['TotalPrice'].transform('mean')
# 5. Total Items per Invoice: Total quantity of items per invoice
data_cleaned['ItemsPerInvoice'] = data_cleaned.groupby('InvoiceID')['Quantity'].transform('sum')
# 6. Invoice Count per Customer: Total number of invoices each customer has
data_cleaned['InvoicesPerCustomer'] = data_cleaned.groupby('customerID')['InvoiceID'].transform('nunique')
# 7. Average Unit Price by Product: Average selling price of each product
data_cleaned['AvgUnitPriceProduct'] = data_cleaned.groupby('StockItemID')['UnitPrice'].transform('mean')
# 8. Average Quantity per Product Sold: Average quantity sold per product across invoices
data_cleaned['AvgQuantityProduct'] = data_cleaned.groupby('StockItemID')['Quantity'].transform('mean')
# 9. Profit per Product: Total profit made per product
data_cleaned['ProfitPerProduct'] = data_cleaned.groupby('StockItemID')['LineProfit'].transform('sum')
# 10. Cumulative Sales by Customer: Cumulative sales over time for each customer
data_cleaned['CumulativeSalesCustomer'] = data_cleaned.sort_values('InvoiceID').groupby('customerID')['TotalPrice'].cumsum()
# 11. Sales Contribution by Product Category: Contribution of each product category to total sales
data_cleaned['CategorySalesContribution'] = data_cleaned.groupby('CustomerCategoryID')['TotalPrice'].transform('sum') / data_cleaned['TotalPrice'].sum()
# 12. High Value Customer Flag: Flag to mark customers with above-average cumulative sales
average_cumulative_sales = data_cleaned['CumulativeSalesCustomer'].mean()
data_cleaned['HighValueCustomer'] = data_cleaned['CumulativeSalesCustomer'] > average_cumulative_sales
# 13. Year and Month of Sale: Extracting year and month from the date
data_cleaned['InvoiceYear'] = data_cleaned['InvoiceDate'].dt.year
data_cleaned['InvoiceMonth'] = data_cleaned['InvoiceDate'].dt.month
# 14. Profit per Invoice: Sum of profit per invoice
data_cleaned['ProfitPerInvoice'] = data_cleaned.groupby('InvoiceID')['LineProfit'].transform('sum')
# Display the updated DataFrame with new features
print(data_cleaned.head())
CityID CityName StateProvinceID StateProvinceName CountryID \
0 9077 Dickworsham 45 Texas 230
1 26752 Petronila 45 Texas 230
2 9857 East Mountain 45 Texas 230
3 8375 Dallardsville 45 Texas 230
4 29320 Rockwall 45 Texas 230
CountryName customerID CustomerName \
0 United States 599 Wingtip Toys (Dickworsham, TX)
1 United States 814 Johanna Hoornstra
2 United States 530 Wingtip Toys (East Mountain, TX)
3 United States 34 Tailspin Toys (Dallardsville, TX)
4 United States 408 Wingtip Toys (Rockwall, TX)
CustomerCategoryID CustomerCategoryName ... InvoicesPerCustomer \
0 3 Novelty Shop ... 104
1 3 Novelty Shop ... 89
2 3 Novelty Shop ... 103
3 3 Novelty Shop ... 105
4 3 Novelty Shop ... 101
AvgUnitPriceProduct AvgQuantityProduct ProfitPerProduct \
0 13.0 5.616728 51943.5
1 13.0 5.559229 51459.0
2 13.0 5.446111 49393.5
3 13.0 5.397590 49504.0
4 13.0 5.559229 51459.0
CumulativeSalesCustomer CategorySalesContribution HighValueCustomer \
0 285.00 0.715309 False
1 2666.75 0.715309 False
2 6593.00 0.715309 False
3 6869.10 0.715309 False
4 4082.00 0.715309 False
InvoiceYear InvoiceMonth ProfitPerInvoice
0 2013 1 447.35
1 2013 1 987.50
2 2013 1 777.50
3 2013 2 476.50
4 2013 2 1169.00
[5 rows x 38 columns]
# 1. Total Sales by Customer
sales_by_customer = data_cleaned.groupby('customerID')['TotalPrice'].sum().reset_index()
sales_by_customer.columns = ['CustomerID', 'TotalSales']
sales_by_customer
| CustomerID | TotalSales | |
|---|---|---|
| 0 | 1 | 305494.40 |
| 1 | 2 | 229142.90 |
| 2 | 3 | 307461.05 |
| 3 | 4 | 299127.85 |
| 4 | 5 | 257129.40 |
| ... | ... | ... |
| 658 | 1057 | 46772.25 |
| 659 | 1058 | 54552.40 |
| 660 | 1059 | 13208.10 |
| 661 | 1060 | 7240.20 |
| 662 | 1061 | 22829.65 |
663 rows × 2 columns
# 2. Average Profit Margin by Customer Category
avg_profit_margin_category = data_cleaned.groupby('CustomerCategoryID')['ProfitMargin'].mean().reset_index()
avg_profit_margin_category.columns = ['CustomerCategoryID', 'AverageProfitMargin']
avg_profit_margin_category
| CustomerCategoryID | AverageProfitMargin | |
|---|---|---|
| 0 | 3 | 53.458527 |
| 1 | 4 | 54.341259 |
| 2 | 5 | 54.490082 |
| 3 | 6 | 54.836917 |
| 4 | 7 | 54.635586 |
# 3. Total Quantity Sold by Product
total_quantity_product = data_cleaned.groupby('StockItemID')['Quantity'].sum().reset_index()
total_quantity_product.columns = ['StockItemID', 'TotalQuantitySold']
total_quantity_product
| StockItemID | TotalQuantitySold | |
|---|---|---|
| 0 | 1 | 5869 |
| 1 | 2 | 6048 |
| 2 | 3 | 56160 |
| 3 | 4 | 5826 |
| 4 | 5 | 5571 |
| ... | ... | ... |
| 222 | 223 | 15504 |
| 223 | 224 | 14976 |
| 224 | 225 | 15840 |
| 225 | 226 | 18408 |
| 226 | 227 | 16056 |
227 rows × 2 columns
# 4. Average Sales per Invoice
avg_sales_invoice = data_cleaned.groupby('InvoiceID')['TotalPrice'].mean().reset_index()
avg_sales_invoice.columns = ['InvoiceID', 'AverageSales']
avg_sales_invoice
| InvoiceID | AverageSales | |
|---|---|---|
| 0 | 1 | 2300.000000 |
| 1 | 2 | 202.500000 |
| 2 | 3 | 90.000000 |
| 3 | 4 | 148.400000 |
| 4 | 5 | 234.666667 |
| ... | ... | ... |
| 70505 | 70506 | 308.500000 |
| 70506 | 70507 | 1571.666667 |
| 70507 | 70508 | 794.000000 |
| 70508 | 70509 | 505.200000 |
| 70509 | 70510 | 965.200000 |
70510 rows × 2 columns
# 5. Total Profit by Product Category
total_profit_category = data_cleaned.groupby('CustomerCategoryID')['LineProfit'].sum().reset_index()
total_profit_category.columns = ['CustomerCategoryID', 'TotalProfit']
total_profit_category
| CustomerCategoryID | TotalProfit | |
|---|---|---|
| 0 | 3 | 61208873.50 |
| 1 | 4 | 7117143.55 |
| 2 | 5 | 5780912.10 |
| 3 | 6 | 5873730.00 |
| 4 | 7 | 5748521.75 |
# 6. Customer Lifetime Sales (Cumulative Sales)
customer_lifetime_sales = data_cleaned.groupby('customerID')['TotalPrice'].sum().reset_index()
customer_lifetime_sales.columns = ['CustomerID', 'LifetimeSales']
customer_lifetime_sales
| CustomerID | LifetimeSales | |
|---|---|---|
| 0 | 1 | 305494.40 |
| 1 | 2 | 229142.90 |
| 2 | 3 | 307461.05 |
| 3 | 4 | 299127.85 |
| 4 | 5 | 257129.40 |
| ... | ... | ... |
| 658 | 1057 | 46772.25 |
| 659 | 1058 | 54552.40 |
| 660 | 1059 | 13208.10 |
| 661 | 1060 | 7240.20 |
| 662 | 1061 | 22829.65 |
663 rows × 2 columns
# 7. Monthly Sales Trend Peer Each Year
monthly_sales_trend = data_cleaned.groupby(data_cleaned['InvoiceDate'].dt.to_period('M'))['TotalPrice'].sum().reset_index()
monthly_sales_trend.columns = ['Month', 'MonthlySales']
monthly_sales_trend
| Month | MonthlySales | |
|---|---|---|
| 0 | 2013-01 | 3770410.85 |
| 1 | 2013-02 | 2776786.20 |
| 2 | 2013-03 | 3870505.30 |
| 3 | 2013-04 | 4059606.85 |
| 4 | 2013-05 | 4417965.55 |
| 5 | 2013-06 | 4069036.20 |
| 6 | 2013-07 | 4381767.45 |
| 7 | 2013-08 | 3495991.00 |
| 8 | 2013-09 | 3779040.85 |
| 9 | 2013-10 | 3752608.45 |
| 10 | 2013-11 | 3697461.90 |
| 11 | 2013-12 | 3636007.40 |
| 12 | 2014-01 | 4067538.00 |
| 13 | 2014-02 | 3470209.20 |
| 14 | 2014-03 | 3861928.75 |
| 15 | 2014-04 | 4095234.65 |
| 16 | 2014-05 | 4590639.10 |
| 17 | 2014-06 | 4266644.10 |
| 18 | 2014-07 | 4786301.05 |
| 19 | 2014-08 | 4085489.60 |
| 20 | 2014-09 | 3882968.85 |
| 21 | 2014-10 | 4438683.65 |
| 22 | 2014-11 | 4018967.45 |
| 23 | 2014-12 | 4364882.80 |
| 24 | 2015-01 | 4401699.25 |
| 25 | 2015-02 | 4195319.25 |
| 26 | 2015-03 | 4528131.65 |
| 27 | 2015-04 | 5073264.75 |
| 28 | 2015-05 | 4480730.55 |
| 29 | 2015-06 | 4515840.45 |
| 30 | 2015-07 | 5155672.00 |
| 31 | 2015-08 | 3938163.40 |
| 32 | 2015-09 | 4662600.00 |
| 33 | 2015-10 | 4492049.40 |
| 34 | 2015-11 | 4089208.50 |
| 35 | 2015-12 | 4458811.25 |
| 36 | 2016-01 | 4447705.95 |
| 37 | 2016-02 | 4005616.85 |
| 38 | 2016-03 | 4645254.00 |
| 39 | 2016-04 | 4563666.10 |
| 40 | 2016-05 | 4970932.65 |
# 8. Top 5 Products by Total Sales
top_products_sales = data_cleaned.groupby('StockItemID')['TotalPrice'].sum().nlargest(5).reset_index()
top_products_sales.columns = ['StockItemID', 'TotalSales']
top_products_sales
| StockItemID | TotalSales | |
|---|---|---|
| 0 | 215 | 11107251.0 |
| 1 | 173 | 6384000.0 |
| 2 | 167 | 6329070.0 |
| 3 | 161 | 6214320.0 |
| 4 | 164 | 6190240.0 |
# 9. Number of Unique Products Sold by Each Customer
unique_products_per_customer = data_cleaned.groupby('customerID')['StockItemID'].nunique().reset_index()
unique_products_per_customer.columns = ['CustomerID', 'UniqueProductsPurchased']
unique_products_per_customer
| CustomerID | UniqueProductsPurchased | |
|---|---|---|
| 0 | 1 | 189 |
| 1 | 2 | 178 |
| 2 | 3 | 191 |
| 3 | 4 | 172 |
| 4 | 5 | 178 |
| ... | ... | ... |
| 658 | 1057 | 51 |
| 659 | 1058 | 46 |
| 660 | 1059 | 28 |
| 661 | 1060 | 14 |
| 662 | 1061 | 29 |
663 rows × 2 columns
# 10. Sales Contribution per Region
sales_contribution_region = data_cleaned.groupby('CountryID')['TotalPrice'].sum().reset_index()
sales_contribution_region['SalesContribution'] = sales_contribution_region['TotalPrice'] / sales_contribution_region['TotalPrice'].sum()
sales_contribution_region.columns = ['CountryID', 'TotalSales', 'SalesContribution']
sales_contribution_region
| CountryID | TotalSales | SalesContribution | |
|---|---|---|---|
| 0 | 230 | 172261341.2 | 1.0 |
# 11. Average Order Size by Customer Category
avg_order_size_category = data_cleaned.groupby('CustomerCategoryID')['TotalPrice'].mean().reset_index()
avg_order_size_category.columns = ['CustomerCategoryID', 'AverageOrderSize']
avg_order_size_category
| CustomerCategoryID | AverageOrderSize | |
|---|---|---|
| 0 | 3 | 755.941708 |
| 1 | 4 | 765.566468 |
| 2 | 5 | 741.142890 |
| 3 | 6 | 748.278090 |
| 4 | 7 | 747.850271 |
# 12. Number of Transactions per Product
transaction_count_per_product = data_cleaned.groupby('StockItemID')['InvoiceID'].nunique().reset_index()
transaction_count_per_product.columns = ['StockItemID', 'TransactionCount']
transaction_count_per_product
| StockItemID | TransactionCount | |
|---|---|---|
| 0 | 1 | 1048 |
| 1 | 2 | 1078 |
| 2 | 3 | 1022 |
| 3 | 4 | 1066 |
| 4 | 5 | 1046 |
| ... | ... | ... |
| 222 | 223 | 115 |
| 223 | 224 | 117 |
| 224 | 225 | 126 |
| 225 | 226 | 150 |
| 226 | 227 | 117 |
227 rows × 2 columns
# 13. Average Profit per Transaction by Customer
avg_profit_per_customer = data_cleaned.groupby('customerID')['LineProfit'].mean().reset_index()
avg_profit_per_customer.columns = ['CustomerID', 'AverageProfitPerTransaction']
avg_profit_per_customer
| CustomerID | AverageProfitPerTransaction | |
|---|---|---|
| 0 | 1 | 362.623522 |
| 1 | 2 | 309.300000 |
| 2 | 3 | 367.718879 |
| 3 | 4 | 456.230538 |
| 4 | 5 | 342.520964 |
| ... | ... | ... |
| 658 | 1057 | 419.571818 |
| 659 | 1058 | 601.868000 |
| 660 | 1059 | 211.093333 |
| 661 | 1060 | 217.360000 |
| 662 | 1061 | 350.167241 |
663 rows × 2 columns
# 14. Product Performance by Region
# Aggregating total sales of each product by region (e.g., CountryID)
product_performance_region = data_cleaned.groupby(['CountryID', 'StockItemID'])['TotalPrice'].sum().reset_index()
product_performance_region.columns = ['CountryID', 'StockItemID', 'TotalSales']
product_performance_region
| CountryID | StockItemID | TotalSales | |
|---|---|---|---|
| 0 | 230 | 1 | 141342.50 |
| 1 | 230 | 2 | 146523.75 |
| 2 | 230 | 3 | 1038960.00 |
| 3 | 230 | 4 | 180459.20 |
| 4 | 230 | 5 | 174640.00 |
| ... | ... | ... | ... |
| 222 | 230 | 223 | 132559.20 |
| 223 | 230 | 224 | 128044.80 |
| 224 | 230 | 225 | 135432.00 |
| 225 | 230 | 226 | 157388.40 |
| 226 | 230 | 227 | 137278.80 |
227 rows × 3 columns
# 15. Most Frequently Purchased Product per Customer
most_frequent_product_customer = data_cleaned.groupby('customerID')['StockItemID'].agg(lambda x: x.value_counts().index[0]).reset_index()
most_frequent_product_customer.columns = ['CustomerID', 'MostFrequentProduct']
most_frequent_product_customer
| CustomerID | MostFrequentProduct | |
|---|---|---|
| 0 | 1 | 22 |
| 1 | 2 | 177 |
| 2 | 3 | 162 |
| 3 | 4 | 195 |
| 4 | 5 | 19 |
| ... | ... | ... |
| 658 | 1057 | 223 |
| 659 | 1058 | 115 |
| 660 | 1059 | 1 |
| 661 | 1060 | 212 |
| 662 | 1061 | 21 |
663 rows × 2 columns
# Display the grouped data summaries
print("Total Sales by Customer:\n", sales_by_customer.head())
print("Average Profit Margin by Customer Category:\n", avg_profit_margin_category.head())
print("Total Quantity Sold by Product:\n", total_quantity_product.head())
print("Average Sales per Invoice:\n", avg_sales_invoice.head())
print("Total Profit by Product Category:\n", total_profit_category.head())
print("Customer Lifetime Sales:\n", customer_lifetime_sales.head())
print("Monthly Sales Trend:\n", monthly_sales_trend.head())
print("Top 5 Products by Total Sales:\n", top_products_sales.head())
print("Number of Unique Products Sold by Each Customer:\n", unique_products_per_customer.head())
print("Sales Contribution per Region:\n", sales_contribution_region.head())
print("Average Order Size by Customer Category:\n", avg_order_size_category.head())
print("Number of Transactions per Product:\n", transaction_count_per_product.head())
print("Average Profit per Transaction by Customer:\n", avg_profit_per_customer.head())
print("Product Performance by Region:\n", product_performance_region.head())
print("Most Frequently Purchased Product per Customer:\n", most_frequent_product_customer.head())
Total Sales by Customer:
CustomerID TotalSales
0 1 305494.40
1 2 229142.90
2 3 307461.05
3 4 299127.85
4 5 257129.40
Average Profit Margin by Customer Category:
CustomerCategoryID AverageProfitMargin
0 3 53.458527
1 4 54.341259
2 5 54.490082
3 6 54.836917
4 7 54.635586
Total Quantity Sold by Product:
StockItemID TotalQuantitySold
0 1 5869
1 2 6048
2 3 56160
3 4 5826
4 5 5571
Average Sales per Invoice:
InvoiceID AverageSales
0 1 2300.000000
1 2 202.500000
2 3 90.000000
3 4 148.400000
4 5 234.666667
Total Profit by Product Category:
CustomerCategoryID TotalProfit
0 3 61208873.50
1 4 7117143.55
2 5 5780912.10
3 6 5873730.00
4 7 5748521.75
Customer Lifetime Sales:
CustomerID LifetimeSales
0 1 305494.40
1 2 229142.90
2 3 307461.05
3 4 299127.85
4 5 257129.40
Monthly Sales Trend:
Month MonthlySales
0 2013-01 3770410.85
1 2013-02 2776786.20
2 2013-03 3870505.30
3 2013-04 4059606.85
4 2013-05 4417965.55
Top 5 Products by Total Sales:
StockItemID TotalSales
0 215 11107251.0
1 173 6384000.0
2 167 6329070.0
3 161 6214320.0
4 164 6190240.0
Number of Unique Products Sold by Each Customer:
CustomerID UniqueProductsPurchased
0 1 189
1 2 178
2 3 191
3 4 172
4 5 178
Sales Contribution per Region:
CountryID TotalSales SalesContribution
0 230 172261341.2 1.0
Average Order Size by Customer Category:
CustomerCategoryID AverageOrderSize
0 3 755.941708
1 4 765.566468
2 5 741.142890
3 6 748.278090
4 7 747.850271
Number of Transactions per Product:
StockItemID TransactionCount
0 1 1048
1 2 1078
2 3 1022
3 4 1066
4 5 1046
Average Profit per Transaction by Customer:
CustomerID AverageProfitPerTransaction
0 1 362.623522
1 2 309.300000
2 3 367.718879
3 4 456.230538
4 5 342.520964
Product Performance by Region:
CountryID StockItemID TotalSales
0 230 1 141342.50
1 230 2 146523.75
2 230 3 1038960.00
3 230 4 180459.20
4 230 5 174640.00
Most Frequently Purchased Product per Customer:
CustomerID MostFrequentProduct
0 1 22
1 2 177
2 3 162
3 4 195
4 5 19
# 1. Distribution of Total Price (Histogram)
plt.figure(figsize=(10, 6))
sns.histplot(data_cleaned['TotalPrice'], bins=30, kde=True)
plt.title("Distribution of Total Price")
plt.xlabel("Total Price")
plt.ylabel("Frequency")
plt.show()
# 2. Distribution of Profit Margin (Histogram)
plt.figure(figsize=(10, 6))
sns.histplot(data_cleaned['ProfitMargin'], bins=30, kde=True)
plt.title("Distribution of Profit Margin")
plt.xlabel("Profit Margin (%)")
plt.ylabel("Frequency")
plt.show()
# 3. Count of Transactions per Customer Category (Bar Chart)
plt.figure(figsize=(10, 6))
sns.countplot(x='CustomerCategoryID', data=data_cleaned)
plt.title("Count of Transactions per Customer Category")
plt.xlabel("Customer Category")
plt.ylabel("Transaction Count")
plt.show()
# 4. Number of Items per Invoice (Box Plot)
plt.figure(figsize=(10, 6))
sns.boxplot(x='ItemsPerInvoice', data=data_cleaned)
plt.title("Distribution of Number of Items per Invoice")
plt.xlabel("Items per Invoice")
plt.show()
# 5. Sales Contribution by Product Category (Bar Chart)
plt.figure(figsize=(10, 6))
sns.barplot(x='CustomerCategoryID', y='CategorySalesContribution', data=data_cleaned)
plt.title("Sales Contribution by Product Category")
plt.xlabel("Product Category")
plt.ylabel("Sales Contribution")
plt.show()
# 6. Distribution of Profit per Invoice (Violin Plot)
plt.figure(figsize=(10, 6))
sns.violinplot(x='ProfitPerInvoice', data=data_cleaned)
plt.title("Distribution of Profit per Invoice")
plt.xlabel("Profit per Invoice")
plt.show()
# 7. Quantity vs. Total Price (Scatter Plot with Regression Line)
plt.figure(figsize=(10, 6))
sns.regplot(x='Quantity', y='TotalPrice', data=data_cleaned)
plt.title("Quantity vs Total Price")
plt.xlabel("Quantity")
plt.ylabel("Total Price")
plt.show()
# 8. Quantity Sold vs Revenue per Customer (Scatter Plot)
plt.figure(figsize=(10, 6))
sns.scatterplot(x='RevenuePerCustomer', y='Quantity', data=data_cleaned)
plt.title("Quantity Sold vs Revenue per Customer")
plt.xlabel("Revenue per Customer")
plt.ylabel("Quantity Sold")
plt.show()
plt.figure(figsize=(12, 8))
sns.heatmap(data_cleaned[['TotalPrice', 'ProfitMargin', 'RevenuePerCustomer', 'ItemsPerInvoice', 'Quantity', 'LineProfit']].corr(), annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.title("Correlation Matrix of Key Financial Variables")
plt.show()
# 10. Total Sales by Customer (Interactive Bar Chart)
fig1 = px.bar(sales_by_customer, x='CustomerID', y='TotalSales', title="Total Sales by Customer")
fig1.show()
# 11. Average Profit Margin by Customer Category (Interactive Pie Chart)
fig2 = px.pie(avg_profit_margin_category, names='CustomerCategoryID', values='AverageProfitMargin', title="Average Profit Margin by Customer Category")
fig2.show()
# 12. Sales Contribution per Region (Donut Chart)
fig4 = px.pie(sales_contribution_region, names='CountryID', values='SalesContribution', title="Sales Contribution by Region", hole=0.4)
fig4.show()
# 13. Top 5 Products by Total Sales (Horizontal Bar Chart)
fig5 = px.bar(top_products_sales, x='TotalSales', y='StockItemID', title="Top 5 Products by Total Sales", orientation='h')
fig5.show()
# 14. Cumulative Sales by Customer (Scatter Plot)
fig6 = px.scatter(data_cleaned, x='customerID', y='CumulativeSalesCustomer', title="Cumulative Sales by Customer")
fig6.show()
# 15. Profit per Product by Region (Treemap)
fig7 = px.treemap(product_performance_region, path=['CountryID', 'StockItemID'], values='TotalSales', title="Profit per Product by Region")
fig7.show()
# 16. Average Quantity Sold per Product (Box Plot)
fig8 = px.box(data_cleaned, x='StockItemID', y='Quantity', title="Average Quantity Sold per Product")
fig8.show()
# 17. Average Quantity Sold and Average Unit Price per Product (Bubble Chart)
fig11 = px.scatter(data_cleaned, x='AvgUnitPriceProduct', y='AvgQuantityProduct', size='AvgQuantityProduct', color='StockItemID', title="Average Quantity Sold and Unit Price per Product")
fig11.show()